# -*- coding: utf-8 -*-
import re

import scrapy


class MuluSpider(scrapy.Spider):
    """Spider that records the first path segment of links on the start page.

    Every ``<a href>`` on the fetched page whose value looks like
    ``<anything>//<host>/<segment>/<rest>`` contributes ``<segment>``, which
    is printed and appended as one line to ``1.txt`` in the current working
    directory.
    """

    name = 'mulu'
    allowed_domains = ['www.taiwan.cn']
    start_urls = ['http://www.taiwan.cn/']

    def start_requests(self):
        """Yield one request per start URL, sent with a mobile User-Agent."""
        # The header set is identical for every request, so build it once.
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers)

    def parse(self, response):
        """Extract link path segments from *response* and append them to 1.txt.

        Args:
            response: The downloaded page; only ``response.xpath`` is used.

        Returns:
            None. Results are emitted as side effects (stdout and ``1.txt``).
        """
        hrefs = response.xpath('//a/@href').extract()

        # The pattern demands '//' followed by at least two more '/', so a
        # matching href always splits into five or more parts and index 3
        # (the first path segment after the host) is guaranteed to exist.
        segments = [
            href.split('/')[3]
            for href in hrefs
            if re.match(r'.*//.*/.*/.*', href) is not None
        ]
        if not segments:
            # No matching links: do not touch (or create) the output file.
            return

        # Open the output file once and let the context manager close it;
        # opening it per link without closing leaked a file handle each time.
        with open("1.txt", "a", encoding="utf-8") as fo:
            for segment in segments:
                if segment:
                    print(segment)
                    fo.write(segment + "\n")





